Code
library(tidyverse)
library(ggplot2)
library(tidymodels)
Tony Duan
October 12, 2023
from https://www.kaggle.com/c/titanic/data
Rows: 891
Columns: 12
$ PassengerId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
$ Survived <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
$ Pclass <dbl> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
$ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
$ Sex <chr> "male", "female", "female", "female", "male", "male", "mal…
$ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
$ SibSp <dbl> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
$ Parch <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
$ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
$ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
$ Cabin <chr> NA, "C85", NA, "C123", NA, NA, "E46", NA, NA, NA, "G6", "C…
$ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
[1] 891 13
# Bootstrap sampling
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [891/328]> Bootstrap01
2 <split [891/338]> Bootstrap02
3 <split [891/318]> Bootstrap03
4 <split [891/316]> Bootstrap04
5 <split [891/328]> Bootstrap05
6 <split [891/317]> Bootstrap06
7 <split [891/330]> Bootstrap07
8 <split [891/339]> Bootstrap08
9 <split [891/322]> Bootstrap09
10 <split [891/317]> Bootstrap10
# declare recipe
titanic_recipe <-
recipe(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
data = train_df) %>% # keep variables we want
step_impute_median(Age,Fare) %>% # imputation
step_impute_mode(Embarked) %>% # imputation
step_mutate_at( Pclass, Sex, Embarked, fn = factor) %>% # make these factors
step_mutate(Travelers = SibSp + Parch + 1) %>% # new variable
step_rm(SibSp, Parch) %>% # remove variables
step_dummy(all_nominal_predictors()) %>% # create indicator variables
# normalize numerical variables
step_normalize(all_numeric_predictors()) %>% prep()
summary(titanic_recipe)
# A tibble: 9 × 4
variable type role source
<chr> <list> <chr> <chr>
1 Age <chr [2]> predictor original
2 Fare <chr [2]> predictor original
3 Survived <chr [3]> outcome original
4 Travelers <chr [2]> predictor derived
5 Pclass_X2 <chr [2]> predictor derived
6 Pclass_X3 <chr [2]> predictor derived
7 Sex_male <chr [2]> predictor derived
8 Embarked_Q <chr [2]> predictor derived
9 Embarked_S <chr [2]> predictor derived
# logistic regression
titanic_glm_spec <-
logistic_reg() %>% # model
set_engine('glm') %>% # package to use
set_mode('classification') # choose one of two: classification vs regresson
# random forest
titanic_rf_spec <-
rand_forest(trees = 200) %>% # algorithm speicfic argument:200 trees
set_engine('ranger') %>%
set_mode('classification')
# svm
titanic_svm_spec <-
svm_rbf() %>% # rbf - radial based
set_engine('kernlab') %>%
set_mode('classification')
# logistic regression
doParallel::registerDoParallel() # resample fitting is embarrasingly parrallel problem
titanic_glm_wf <-
workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(titanic_glm_spec)
# random forest
doParallel::registerDoParallel()
titanic_rf_wf <-
workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(titanic_rf_spec)
# svm
doParallel::registerDoParallel()
titanic_svm_wf <-
workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(titanic_svm_spec)
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.793 10 0.00286 Preprocessor1_Model1
2 roc_auc binary 0.842 10 0.00640 Preprocessor1_Model1
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.822 10 0.00456 Preprocessor1_Model1
2 roc_auc binary 0.865 10 0.00554 Preprocessor1_Model1
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.811 10 0.00472 Preprocessor1_Model1
2 roc_auc binary 0.833 10 0.00331 Preprocessor1_Model1
It seems that Random Forest is the winner with 82% accuracy and ROCAUC of 86.5. We use it as a final fit to the whole training data.
# A tibble: 6 × 2
PassengerId Survived
<dbl> <fct>
1 892 0
2 893 0
3 894 0
4 895 0
5 896 1
6 897 0
https://www.kaggle.com/c/titanic/data
https://rpubs.com/tsadigov/titanic_tidymodels
---
title: "tidymodels 2 with bootstrap resamples and workflow"
author: "Tony Duan"
date: "2023-10-12"
categories: [R]
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: left
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
{width="400"}
# package
```{r}
library(tidyverse)
library(ggplot2)
library(tidymodels)
```
# data
from https://www.kaggle.com/c/titanic/data
```{r}
pred <- c("Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked", "title")
train_df_raw <- read_csv('data/train.csv')
test_df_raw <- read_csv('data/test.csv')
glimpse(train_df_raw)
```
## train data
```{r}
train_df=train_df_raw %>%
mutate(Survived=as.factor(Survived),
title = str_trim(str_replace(str_extract(Name, ", [A-Z]+[A-Za-z.]*[:space:]+"), ",", ""))
)
#train_df=train_df %>% select(c(all_of(pred),"Survived"))
dim(train_df)
```
```{r}
train_df %>% count(Survived)
```
```{r}
342/(549+342)
```
## test data
```{r}
test_df=test_df_raw %>%
mutate(
title = str_trim(str_replace(str_extract(Name, ", [A-Z]+[A-Za-z.]*[:space:]+"), ",", ""))
)
#test_df=test_df %>% select(c(all_of(pred)))
dim(test_df)
```
## bootstrap (re)samples for model selection
```{r}
set.seed(2022)
titanic_folds <- bootstraps(data = train_df,
times = 10)
titanic_folds
```
# model
## recipe
```{r}
# declare recipe
titanic_recipe <-
recipe(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
data = train_df) %>% # keep variables we want
step_impute_median(Age,Fare) %>% # imputation
step_impute_mode(Embarked) %>% # imputation
step_mutate_at( Pclass, Sex, Embarked, fn = factor) %>% # make these factors
step_mutate(Travelers = SibSp + Parch + 1) %>% # new variable
step_rm(SibSp, Parch) %>% # remove variables
step_dummy(all_nominal_predictors()) %>% # create indicator variables
# normalize numerical variables
step_normalize(all_numeric_predictors()) %>% prep()
summary(titanic_recipe)
```
```{r}
juice_titanic_recipe=juice(titanic_recipe)
```
## model
```{r}
# logistic regression
titanic_glm_spec <-
logistic_reg() %>% # model
set_engine('glm') %>% # package to use
set_mode('classification') # choose one of two: classification vs regresson
# random forest
titanic_rf_spec <-
rand_forest(trees = 200) %>% # algorithm speicfic argument:200 trees
set_engine('ranger') %>%
set_mode('classification')
# svm
titanic_svm_spec <-
svm_rbf() %>% # rbf - radial based
set_engine('kernlab') %>%
set_mode('classification')
```
## workflow
```{r}
# logistic regression
doParallel::registerDoParallel() # resample fitting is embarrasingly parrallel problem
titanic_glm_wf <-
workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(titanic_glm_spec)
# random forest
doParallel::registerDoParallel()
titanic_rf_wf <-
workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(titanic_rf_spec)
# svm
doParallel::registerDoParallel()
titanic_svm_wf <-
workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(titanic_svm_spec)
```
## trainning
```{r}
glm_model_fit=titanic_glm_wf%>% fit_resamples(titanic_folds)
rf_model_fit=titanic_rf_wf%>% fit_resamples(titanic_folds)
svm_model_fit=titanic_svm_wf%>% fit_resamples(titanic_folds)
```
# result
```{r}
collect_metrics(glm_model_fit)
```
```{r}
collect_metrics(rf_model_fit)
```
```{r}
collect_metrics(svm_model_fit)
```
## last fit
It seems that Random Forest is the winner with 82% accuracy and ROCAUC of 86.5. We use it as a final fit to the whole training data.
```{r}
#random forest workflow
titanic_rf_last_wf <-
workflow() %>%
add_recipe(titanic_recipe) %>%
add_model(titanic_rf_spec)
```
```{r}
# last fit
final_fit <-
fit(object = titanic_rf_last_wf,
data = train_df)
```
```{r}
#result
final_fit %>%
extract_recipe(estimated = T)
```
# predictions
```{r}
test_pred <-
final_fit %>%predict(test_df)
```
```{r}
final_result=test_pred %>% bind_cols(test_df) %>%
select(PassengerId, .pred_class) %>%
rename(Survived=.pred_class)
```
```{r}
head(final_result)
```
```{r}
final_result %>% count(Survived)
```
```{r}
119/(119+299)
```
# Reference
https://www.kaggle.com/c/titanic/data
https://rpubs.com/tsadigov/titanic_tidymodels